# -*- coding: utf-8 -*-
# filename:_crawl_web_xpath.py

# import requests
from urllib2 import urlopen
from urllib import urlretrieve
from lxml import etree
import re
import os
import threading

# Static HTML snippet that gets prepended to the saved index.html (see the
# fs.write(page + ps) below): a bare "Website" form with submit/reset buttons.
# NOTE(review): the saved file ends up as two concatenated <html> documents —
# browsers tolerate this, but it is not valid HTML.
page = '''
<html>
<head><title>WebAgent</title></head>
<body>
<form action="#" method="get">
Website: <input type="text" name="website" />
<button type="submit" value="Submit">Submit</button>
<button type="reset" value="Reset">Reset</button>
</form>
</body>
</html>
'''

print('START')

# Candidate URLs: some bare hostnames, some carrying an explicit http:// scheme.
urls = ['www.csdn.net', 'http://www.csdn.net', 'http://blog.csdn.net/hxiaohai/article/details/78390124', "www.haydnliao.win", "www.baidu.com"]

# Keep only strings that end in a plausible domain(+path), with the
# "http://" prefix stripped so duplicates collapse in the set.
rUrls = set()
_domain_re = re.compile(r'([\w-]+\.)+[\w-]+(/[\w./?%&=]*)?$')
for candidate in urls:
    if _domain_re.search(candidate) is None:
        continue
    if candidate.startswith('http://'):
        candidate = candidate[len('http://'):]
    rUrls.add(candidate)

# TODO: HTTP redirects are not followed
# TODO: non-ASCII (IDN) domain names are not supported
rUrl = "www.csdn.net"

# Fetch the landing page.  close() sits in a finally block so the
# connection is released even when read() raises (the original leaked
# the handle on a read error).
uc = urlopen("http://" + rUrl)
try:
    ps = uc.read()
finally:
    uc.close()

html = etree.HTML(ps)
# Every element carrying a link-like attribute; most @href values are
# .html/.css, most @src values are .js/.jpg/.png.
nodes = html.xpath("//*[@href | @src]")
# Flat save directory named after the host part only (dots -> underscores);
# no per-path subdirectories are created.
sp = "./_{}/".format(rUrl.split('/')[0].replace('.', '_'))
if not os.path.exists(sp):
    os.makedirs(sp)  # create the download directory
sfThreadList = []  # download worker threads, started after index.html is written
sfThreadList = []  # download worker threads
# Walk every link-bearing element, queue its target for download, and
# rewrite the page text so it references the local copy instead.
for node in nodes:
    keys = node.keys()
    values = node.values()
    for index in range(len(keys)):
        if keys[index] == "href" or keys[index] == "src":
            # TODO: the regex is flawed (e.g. fails on cn.bing.com).
            # NOTE(review): inside "[\u4e00-\u9fa5 | -_=,&\w]" the span
            # " -_" is a character RANGE and '|' is a literal — confirm
            # this is the intended character set.
            value = re.search(ur'[_\w\.-]+(/[_\w-]+)*/[\u4e00-\u9fa5 | -_=,&\w]+\.\w+(\?[&%=\w-]+)?$', values[index])  # matches plain file URLs, '?'-suffixed URLs, '='/','/'&' URLs, and Chinese file names
            if value:
                tValue = re.search(ur'^[_\w-]+(/[_\w-]+)*/[\u4e00-\u9fa5 | -_=,&\w]+\.\w+', values[index]) # matches host-less relative paths (excludes '?' paths)
                if tValue is not None:
                    value = rUrl + '/' + tValue.group() # prefix the host to complete the URL
                else:
                    value = value.group().split('?')[0] # full URL: drop any query string
                try:
                    value = value.encode('utf-8')
                    values[index] = values[index].encode('utf-8')
                    fn = value.split('/')[-1]  # local file name = last path segment
                    fp = sp + fn
                    print fn, fp
                    sfThreadList.append(threading.Thread(target=urlretrieve, args=("http://" + value, fp))) # save the file from a worker thread
                    ps = ps.replace(values[index], fn)  # rewrite the HTML to the local file name
                except Exception, err:
                    print ">>>Save>>>", value, ">>>", err
# TODO: multithreaded downloading affects the index.html contents
# NOTE(review): index.html is written BEFORE the download threads run, so
# the rewritten links may point at files that do not exist yet (or never
# will, if a download fails).
with open("{}index.html".format(sp), "wb") as fs:
    fs.write(page + ps)  # form template followed by the rewritten page
    print "index.html"
try:
    # Start every queued download, then block until all of them finish.
    for t in sfThreadList:
        t.start()
    for t in sfThreadList:
        t.join()
except Exception, err:
    print ">>>Thread>>>", err

print 'END'
